{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "import codecs\n",
    "\n",
    "import jieba as jb #分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   label                                               text\n",
       "0      2  合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...\n",
       "1      2  公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。\n",
       "2      1  公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...\n",
       "3      2  公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...\n",
       "4      2  该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专..."
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Directory holding the input data files (relative path)\n",
    "dpath = './data/'\n",
    "\n",
    "# training.csv has no header row: read it as-is, then name the two columns\n",
    "train = pd.read_csv(\n",
    "    dpath + \"training.csv\",\n",
    "    header=None,\n",
    "    encoding='utf8',\n",
    ")\n",
    "train.columns = ['label', 'text']\n",
    "\n",
    "# Preview the first rows\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读停用词表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_file(file_path):\n",
    "    \"\"\"Read a UTF-8 text file and return its lines without line endings.\n",
    "\n",
    "    Args:\n",
    "        file_path: Path of the file to read.\n",
    "\n",
    "    Returns:\n",
    "        list of str: one entry per line, in file order, with trailing\n",
    "        newline and then carriage-return characters stripped.\n",
    "    \"\"\"\n",
    "    # The with-block closes the handle even if decoding fails part-way,\n",
    "    # so the file descriptor is not leaked.\n",
    "    with codecs.open(file_path, mode='r', encoding='utf-8') as f:\n",
    "        # codecs.open does not translate newlines, so strip both endings.\n",
    "        return [line.rstrip('\\n').rstrip('\\r') for line in f]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['!',\n",
       " '\"',\n",
       " '#',\n",
       " '$',\n",
       " '%',\n",
       " '&',\n",
       " \"'\",\n",
       " '(',\n",
       " ')',\n",
       " '*',\n",
       " '+',\n",
       " ',',\n",
       " '-',\n",
       " '--',\n",
       " '.',\n",
       " '..',\n",
       " '...',\n",
       " '......',\n",
       " '...................',\n",
       " './',\n",
       " '.一',\n",
       " '.数',\n",
       " '.日',\n",
       " '/',\n",
       " '//',\n",
       " '0',\n",
       " '1',\n",
       " '2',\n",
       " '3',\n",
       " '4',\n",
       " '5',\n",
       " '6',\n",
       " '7',\n",
       " '8',\n",
       " '9',\n",
       " ':',\n",
       " '://',\n",
       " '::',\n",
       " ';',\n",
       " '<',\n",
       " '=',\n",
       " '>',\n",
       " '>>',\n",
       " '?',\n",
       " '@',\n",
       " 'A',\n",
       " 'Lex',\n",
       " '[',\n",
       " '\\\\',\n",
       " ']',\n",
       " '^',\n",
       " '_',\n",
       " '`',\n",
       " 'exp',\n",
       " 'sub',\n",
       " 'sup',\n",
       " '|',\n",
       " '}',\n",
       " '~',\n",
       " '~~~~',\n",
       " '·',\n",
       " '×',\n",
       " '×××',\n",
       " 'Δ',\n",
       " 'Ψ',\n",
       " 'γ',\n",
       " 'μ',\n",
       " 'φ',\n",
       " 'φ．',\n",
       " 'В',\n",
       " '—',\n",
       " '——',\n",
       " '———',\n",
       " '‘',\n",
       " '’',\n",
       " '’‘',\n",
       " '“',\n",
       " '”',\n",
       " '”，',\n",
       " '…',\n",
       " '……',\n",
       " '…………………………………………………③',\n",
       " '′∈',\n",
       " '′｜',\n",
       " '℃',\n",
       " 'Ⅲ',\n",
       " '↑',\n",
       " '→',\n",
       " '∈［',\n",
       " '∪φ∈',\n",
       " '≈',\n",
       " '①',\n",
       " '②',\n",
       " '②ｃ',\n",
       " '③',\n",
       " '③］',\n",
       " '④',\n",
       " '⑤',\n",
       " '⑥',\n",
       " '⑦',\n",
       " '⑧',\n",
       " '⑨',\n",
       " '⑩',\n",
       " '──',\n",
       " '■',\n",
       " '▲',\n",
       " '\\u3000',\n",
       " '、',\n",
       " '。',\n",
       " '〈',\n",
       " '〉',\n",
       " '《',\n",
       " '》',\n",
       " '》），',\n",
       " '」',\n",
       " '『',\n",
       " '』',\n",
       " '【',\n",
       " '】',\n",
       " '〔',\n",
       " '〕',\n",
       " '〕〔',\n",
       " '㈧',\n",
       " '一',\n",
       " '一.',\n",
       " '一一',\n",
       " '一下',\n",
       " '一个',\n",
       " '一些',\n",
       " '一何',\n",
       " '一切',\n",
       " '一则',\n",
       " '一则通过',\n",
       " '一天',\n",
       " '一定',\n",
       " '一方面',\n",
       " '一旦',\n",
       " '一时',\n",
       " '一来',\n",
       " '一样',\n",
       " '一次',\n",
       " '一片',\n",
       " '一番',\n",
       " '一直',\n",
       " '一致',\n",
       " '一般',\n",
       " '一起',\n",
       " '一转眼',\n",
       " '一边',\n",
       " '一面',\n",
       " '七',\n",
       " '万一',\n",
       " '三',\n",
       " '三天两头',\n",
       " '三番两次',\n",
       " '三番五次',\n",
       " '上',\n",
       " '上下',\n",
       " '上升',\n",
       " '上去',\n",
       " '上来',\n",
       " '上述',\n",
       " '上面',\n",
       " '下',\n",
       " '下列',\n",
       " '下去',\n",
       " '下来',\n",
       " '下面',\n",
       " '不',\n",
       " '不一',\n",
       " '不下',\n",
       " '不久',\n",
       " '不了',\n",
       " '不亦乐乎',\n",
       " '不仅',\n",
       " '不仅...而且',\n",
       " '不仅仅',\n",
       " '不仅仅是',\n",
       " '不会',\n",
       " '不但',\n",
       " '不但...而且',\n",
       " '不光',\n",
       " '不免',\n",
       " '不再',\n",
       " '不力',\n",
       " '不单',\n",
       " '不变',\n",
       " '不只',\n",
       " '不可',\n",
       " '不可开交',\n",
       " '不可抗拒',\n",
       " '不同',\n",
       " '不外',\n",
       " '不外乎',\n",
       " '不够',\n",
       " '不大',\n",
       " '不如',\n",
       " '不妨',\n",
       " '不定',\n",
       " '不对',\n",
       " '不少',\n",
       " '不尽',\n",
       " '不尽然',\n",
       " '不巧',\n",
       " '不已',\n",
       " '不常',\n",
       " '不得',\n",
       " '不得不',\n",
       " '不得了',\n",
       " '不得已',\n",
       " '不必',\n",
       " '不怎么',\n",
       " '不怕',\n",
       " '不惟',\n",
       " '不成',\n",
       " '不拘',\n",
       " '不择手段',\n",
       " '不敢',\n",
       " '不料',\n",
       " '不断',\n",
       " '不日',\n",
       " '不时',\n",
       " '不是',\n",
       " '不曾',\n",
       " '不止',\n",
       " '不止一次',\n",
       " '不比',\n",
       " '不消',\n",
       " '不满',\n",
       " '不然',\n",
       " '不然的话',\n",
       " '不特',\n",
       " '不独',\n",
       " '不由得',\n",
       " '不知不觉',\n",
       " '不管',\n",
       " '不管怎样',\n",
       " '不经意',\n",
       " '不胜',\n",
       " '不能',\n",
       " '不能不',\n",
       " '不至于',\n",
       " '不若',\n",
       " '不要',\n",
       " '不论',\n",
       " '不起',\n",
       " '不足',\n",
       " '不过',\n",
       " '不迭',\n",
       " '不问',\n",
       " '不限',\n",
       " '与',\n",
       " '与其',\n",
       " '与其说',\n",
       " '与否',\n",
       " '与此同时',\n",
       " '专门',\n",
       " '且',\n",
       " '且不说',\n",
       " '且说',\n",
       " '两者',\n",
       " '严格',\n",
       " '严重',\n",
       " '个',\n",
       " '个人',\n",
       " '个别',\n",
       " '中小',\n",
       " '中间',\n",
       " '丰富',\n",
       " '串行',\n",
       " '临',\n",
       " '临到',\n",
       " '为',\n",
       " '为主',\n",
       " '为了',\n",
       " '为什么',\n",
       " '为什麽',\n",
       " '为何',\n",
       " '为止',\n",
       " '为此',\n",
       " '为着',\n",
       " '主张',\n",
       " '主要',\n",
       " '举凡',\n",
       " '举行',\n",
       " '乃',\n",
       " '乃至',\n",
       " '乃至于',\n",
       " '么',\n",
       " '之',\n",
       " '之一',\n",
       " '之前',\n",
       " '之后',\n",
       " '之後',\n",
       " '之所以',\n",
       " '之类',\n",
       " '乌乎',\n",
       " '乎',\n",
       " '乒',\n",
       " '乘',\n",
       " '乘势',\n",
       " '乘机',\n",
       " '乘胜',\n",
       " '乘虚',\n",
       " '乘隙',\n",
       " '九',\n",
       " '也',\n",
       " '也好',\n",
       " '也就是说',\n",
       " '也是',\n",
       " '也罢',\n",
       " '了',\n",
       " '了解',\n",
       " '争取',\n",
       " '二',\n",
       " '二来',\n",
       " '二话不说',\n",
       " '二话没说',\n",
       " '于',\n",
       " '于是',\n",
       " '于是乎',\n",
       " '云云',\n",
       " '云尔',\n",
       " '互',\n",
       " '互相',\n",
       " '五',\n",
       " '些',\n",
       " '交口',\n",
       " '亦',\n",
       " '产生',\n",
       " '亲口',\n",
       " '亲手',\n",
       " '亲眼',\n",
       " '亲自',\n",
       " '亲身',\n",
       " '人',\n",
       " '人人',\n",
       " '人们',\n",
       " '人家',\n",
       " '人民',\n",
       " '什么',\n",
       " '什么样',\n",
       " '什麽',\n",
       " '仅',\n",
       " '仅仅',\n",
       " '今',\n",
       " '今后',\n",
       " '今天',\n",
       " '今年',\n",
       " '今後',\n",
       " '介于',\n",
       " '仍',\n",
       " '仍旧',\n",
       " '仍然',\n",
       " '从',\n",
       " '从不',\n",
       " '从严',\n",
       " '从中',\n",
       " '从事',\n",
       " '从今以后',\n",
       " '从优',\n",
       " '从古到今',\n",
       " '从古至今',\n",
       " '从头',\n",
       " '从宽',\n",
       " '从小',\n",
       " '从新',\n",
       " '从无到有',\n",
       " '从早到晚',\n",
       " '从未',\n",
       " '从来',\n",
       " '从此',\n",
       " '从此以后',\n",
       " '从而',\n",
       " '从轻',\n",
       " '从速',\n",
       " '从重',\n",
       " '他',\n",
       " '他人',\n",
       " '他们',\n",
       " '他是',\n",
       " '他的',\n",
       " '代替',\n",
       " '以',\n",
       " '以上',\n",
       " '以下',\n",
       " '以为',\n",
       " '以便',\n",
       " '以免',\n",
       " '以前',\n",
       " '以及',\n",
       " '以后',\n",
       " '以外',\n",
       " '以後',\n",
       " '以故',\n",
       " '以期',\n",
       " '以来',\n",
       " '以至',\n",
       " '以至于',\n",
       " '以致',\n",
       " '们',\n",
       " '任',\n",
       " '任何',\n",
       " '任凭',\n",
       " '任务',\n",
       " '企图',\n",
       " '伙同',\n",
       " '会',\n",
       " '伟大',\n",
       " '传',\n",
       " '传说',\n",
       " '传闻',\n",
       " '似乎',\n",
       " '似的',\n",
       " '但',\n",
       " '但凡',\n",
       " '但愿',\n",
       " '但是',\n",
       " '何',\n",
       " '何乐而不为',\n",
       " '何以',\n",
       " '何况',\n",
       " '何处',\n",
       " '何妨',\n",
       " '何尝',\n",
       " '何必',\n",
       " '何时',\n",
       " '何止',\n",
       " '何苦',\n",
       " '何须',\n",
       " '余外',\n",
       " '作为',\n",
       " '你',\n",
       " '你们',\n",
       " '你是',\n",
       " '你的',\n",
       " '使',\n",
       " '使得',\n",
       " '使用',\n",
       " '例如',\n",
       " '依',\n",
       " '依据',\n",
       " '依照',\n",
       " '依靠',\n",
       " '便',\n",
       " '便于',\n",
       " '促进',\n",
       " '保持',\n",
       " '保管',\n",
       " '保险',\n",
       " '俺',\n",
       " '俺们',\n",
       " '倍加',\n",
       " '倍感',\n",
       " '倒不如',\n",
       " '倒不如说',\n",
       " '倒是',\n",
       " '倘',\n",
       " '倘使',\n",
       " '倘或',\n",
       " '倘然',\n",
       " '倘若',\n",
       " '借',\n",
       " '借以',\n",
       " '借此',\n",
       " '假使',\n",
       " '假如',\n",
       " '假若',\n",
       " '偏偏',\n",
       " '做到',\n",
       " '偶尔',\n",
       " '偶而',\n",
       " '傥然',\n",
       " '像',\n",
       " '儿',\n",
       " '允许',\n",
       " '元／吨',\n",
       " '充其极',\n",
       " '充其量',\n",
       " '充分',\n",
       " '先不先',\n",
       " '先后',\n",
       " '先後',\n",
       " '先生',\n",
       " '光',\n",
       " '光是',\n",
       " '全体',\n",
       " '全力',\n",
       " '全年',\n",
       " '全然',\n",
       " '全身心',\n",
       " '全部',\n",
       " '全都',\n",
       " '全面',\n",
       " '八',\n",
       " '八成',\n",
       " '公然',\n",
       " '六',\n",
       " '兮',\n",
       " '共',\n",
       " '共同',\n",
       " '共总',\n",
       " '关于',\n",
       " '其',\n",
       " '其一',\n",
       " '其中',\n",
       " '其二',\n",
       " '其他',\n",
       " '其余',\n",
       " '其后',\n",
       " '其它',\n",
       " '其实',\n",
       " '其次',\n",
       " '具体',\n",
       " '具体地说',\n",
       " '具体来说',\n",
       " '具体说来',\n",
       " '具有',\n",
       " '兼之',\n",
       " '内',\n",
       " '再',\n",
       " '再其次',\n",
       " '再则',\n",
       " '再有',\n",
       " '再次',\n",
       " '再者',\n",
       " '再者说',\n",
       " '再说',\n",
       " '冒',\n",
       " '冲',\n",
       " '决不',\n",
       " '决定',\n",
       " '决非',\n",
       " '况且',\n",
       " '准备',\n",
       " '凑巧',\n",
       " '凝神',\n",
       " '几',\n",
       " '几乎',\n",
       " '几度',\n",
       " '几时',\n",
       " '几番',\n",
       " '几经',\n",
       " '凡',\n",
       " '凡是',\n",
       " '凭',\n",
       " '凭借',\n",
       " '出',\n",
       " '出于',\n",
       " '出去',\n",
       " '出来',\n",
       " '出现',\n",
       " '分别',\n",
       " '分头',\n",
       " '分期',\n",
       " '分期分批',\n",
       " '切',\n",
       " '切不可',\n",
       " '切切',\n",
       " '切勿',\n",
       " '切莫',\n",
       " '则',\n",
       " '则甚',\n",
       " '刚',\n",
       " '刚好',\n",
       " '刚巧',\n",
       " '刚才',\n",
       " '初',\n",
       " '别',\n",
       " '别人',\n",
       " '别处',\n",
       " '别是',\n",
       " '别的',\n",
       " '别管',\n",
       " '别说',\n",
       " '到',\n",
       " '到了儿',\n",
       " '到处',\n",
       " '到头',\n",
       " '到头来',\n",
       " '到底',\n",
       " '到目前为止',\n",
       " '前后',\n",
       " '前此',\n",
       " '前者',\n",
       " '前进',\n",
       " '前面',\n",
       " '加上',\n",
       " '加之',\n",
       " '加以',\n",
       " '加入',\n",
       " '加强',\n",
       " '动不动',\n",
       " '动辄',\n",
       " '勃然',\n",
       " '匆匆',\n",
       " '十分',\n",
       " '千',\n",
       " '千万',\n",
       " '千万千万',\n",
       " '半',\n",
       " '单',\n",
       " '单单',\n",
       " '单纯',\n",
       " '即',\n",
       " '即令',\n",
       " '即使',\n",
       " '即便',\n",
       " '即刻',\n",
       " '即如',\n",
       " '即将',\n",
       " '即或',\n",
       " '即是说',\n",
       " '即若',\n",
       " '却',\n",
       " '却不',\n",
       " '历',\n",
       " '原来',\n",
       " '去',\n",
       " '又',\n",
       " '又及',\n",
       " '及',\n",
       " '及其',\n",
       " '及时',\n",
       " '及至',\n",
       " '双方',\n",
       " '反之',\n",
       " '反之亦然',\n",
       " '反之则',\n",
       " '反倒',\n",
       " '反倒是',\n",
       " '反应',\n",
       " '反手',\n",
       " '反映',\n",
       " '反而',\n",
       " '反过来',\n",
       " '反过来说',\n",
       " '取得',\n",
       " '取道',\n",
       " '受到',\n",
       " '变成',\n",
       " '古来',\n",
       " '另',\n",
       " '另一个',\n",
       " '另一方面',\n",
       " '另外',\n",
       " '另悉',\n",
       " '另方面',\n",
       " '另行',\n",
       " '只',\n",
       " '只当',\n",
       " '只怕',\n",
       " '只是',\n",
       " '只有',\n",
       " '只消',\n",
       " '只要',\n",
       " '只限',\n",
       " '叫',\n",
       " '叫做',\n",
       " '召开',\n",
       " '叮咚',\n",
       " '叮当',\n",
       " '可',\n",
       " '可以',\n",
       " '可好',\n",
       " '可是',\n",
       " '可能',\n",
       " '可见',\n",
       " '各',\n",
       " '各个',\n",
       " '各人',\n",
       " '各位',\n",
       " '各地',\n",
       " '各式',\n",
       " '各种',\n",
       " '各级',\n",
       " '各自',\n",
       " '合理',\n",
       " '同',\n",
       " '同一',\n",
       " '同时',\n",
       " '同样',\n",
       " '后',\n",
       " '后来',\n",
       " '后者',\n",
       " '后面',\n",
       " '向',\n",
       " '向使',\n",
       " '向着',\n",
       " '吓',\n",
       " '吗',\n",
       " '否则',\n",
       " '吧',\n",
       " '吧哒',\n",
       " '吱',\n",
       " '呀',\n",
       " '呃',\n",
       " '呆呆地',\n",
       " '呐',\n",
       " '呕',\n",
       " '呗',\n",
       " '呜',\n",
       " '呜呼',\n",
       " '呢',\n",
       " '周围',\n",
       " '呵',\n",
       " '呵呵',\n",
       " '呸',\n",
       " '呼哧',\n",
       " '呼啦',\n",
       " '咋',\n",
       " '和',\n",
       " '咚',\n",
       " '咦',\n",
       " '咧',\n",
       " '咱',\n",
       " '咱们',\n",
       " '咳',\n",
       " '哇',\n",
       " '哈',\n",
       " '哈哈',\n",
       " '哉',\n",
       " '哎',\n",
       " '哎呀',\n",
       " '哎哟',\n",
       " '哗',\n",
       " '哗啦',\n",
       " '哟',\n",
       " '哦',\n",
       " '哩',\n",
       " '哪',\n",
       " '哪个',\n",
       " '哪些',\n",
       " '哪儿',\n",
       " '哪天',\n",
       " '哪年',\n",
       " '哪怕',\n",
       " '哪样',\n",
       " '哪边',\n",
       " '哪里',\n",
       " '哼',\n",
       " '哼唷',\n",
       " '唉',\n",
       " '唯有',\n",
       " '啊',\n",
       " '啊呀',\n",
       " '啊哈',\n",
       " '啊哟',\n",
       " '啐',\n",
       " '啥',\n",
       " '啦',\n",
       " '啪达',\n",
       " '啷当',\n",
       " '喀',\n",
       " '喂',\n",
       " '喏',\n",
       " '喔唷',\n",
       " '喽',\n",
       " '嗡',\n",
       " '嗡嗡',\n",
       " '嗬',\n",
       " '嗯',\n",
       " '嗳',\n",
       " '嘎',\n",
       " '嘎嘎',\n",
       " '嘎登',\n",
       " '嘘',\n",
       " '嘛',\n",
       " '嘻',\n",
       " '嘿',\n",
       " '嘿嘿',\n",
       " '四',\n",
       " '因',\n",
       " '因为',\n",
       " '因了',\n",
       " '因此',\n",
       " '因着',\n",
       " '因而',\n",
       " '固',\n",
       " '固然',\n",
       " '在',\n",
       " '在下',\n",
       " '在于',\n",
       " '地',\n",
       " '均',\n",
       " '坚决',\n",
       " '坚持',\n",
       " '基于',\n",
       " '基本',\n",
       " '基本上',\n",
       " '处在',\n",
       " '处处',\n",
       " '处理',\n",
       " '复杂',\n",
       " '多',\n",
       " '多么',\n",
       " '多亏',\n",
       " '多多',\n",
       " '多多少少',\n",
       " '多多益善',\n",
       " '多少',\n",
       " '多年前',\n",
       " '多年来',\n",
       " '多数',\n",
       " '多次',\n",
       " '够瞧的',\n",
       " '大',\n",
       " '大不了',\n",
       " '大举',\n",
       " '大事',\n",
       " '大体',\n",
       " '大体上',\n",
       " '大凡',\n",
       " '大力',\n",
       " '大多',\n",
       " '大多数',\n",
       " '大大',\n",
       " '大家',\n",
       " '大张旗鼓',\n",
       " '大批',\n",
       " '大抵',\n",
       " '大概',\n",
       " '大略',\n",
       " '大约',\n",
       " '大致',\n",
       " '大都',\n",
       " '大量',\n",
       " '大面儿上',\n",
       " '失去',\n",
       " '奇',\n",
       " '奈',\n",
       " '奋勇',\n",
       " '她',\n",
       " '她们',\n",
       " '她是',\n",
       " '她的',\n",
       " '好',\n",
       " '好在',\n",
       " '好的',\n",
       " '好象',\n",
       " '如',\n",
       " '如上',\n",
       " '如上所述',\n",
       " '如下',\n",
       " '如今',\n",
       " '如何',\n",
       " '如其',\n",
       " '如前所述',\n",
       " '如同',\n",
       " '如常',\n",
       " '如是',\n",
       " '如期',\n",
       " '如果',\n",
       " '如次',\n",
       " '如此',\n",
       " '如此等等',\n",
       " '如若',\n",
       " '始而',\n",
       " '姑且',\n",
       " '存在',\n",
       " '存心',\n",
       " '孰料',\n",
       " '孰知',\n",
       " '宁',\n",
       " '宁可',\n",
       " '宁愿',\n",
       " '宁肯',\n",
       " '它',\n",
       " '它们',\n",
       " '它们的',\n",
       " '它是',\n",
       " '它的',\n",
       " '安全',\n",
       " '完全',\n",
       " '完成',\n",
       " '定',\n",
       " '实现',\n",
       " '实际',\n",
       " '宣布',\n",
       " '容易',\n",
       " '密切',\n",
       " '对',\n",
       " '对于',\n",
       " '对应',\n",
       " '对待',\n",
       " '对方',\n",
       " '对比',\n",
       " '将',\n",
       " '将才',\n",
       " '将要',\n",
       " '将近',\n",
       " '小',\n",
       " '少数',\n",
       " '尔',\n",
       " '尔后',\n",
       " '尔尔',\n",
       " '尔等',\n",
       " '尚且',\n",
       " '尤其',\n",
       " '就',\n",
       " '就地',\n",
       " '就是',\n",
       " '就是了',\n",
       " '就是说',\n",
       " '就此',\n",
       " '就算',\n",
       " '就要',\n",
       " '尽',\n",
       " '尽可能',\n",
       " '尽如人意',\n",
       " '尽心尽力',\n",
       " '尽心竭力',\n",
       " '尽快',\n",
       " '尽早',\n",
       " '尽然',\n",
       " '尽管',\n",
       " '尽管如此',\n",
       " '尽量',\n",
       " '局外',\n",
       " '居然',\n",
       " '届时',\n",
       " '属于',\n",
       " '屡',\n",
       " '屡屡',\n",
       " '屡次',\n",
       " '屡次三番',\n",
       " '岂',\n",
       " '岂但',\n",
       " '岂止',\n",
       " '岂非',\n",
       " '川流不息',\n",
       " '左右',\n",
       " '巨大',\n",
       " '巩固',\n",
       " '差一点',\n",
       " '差不多',\n",
       " '己',\n",
       " '已',\n",
       " '已矣',\n",
       " '已经',\n",
       " '巴',\n",
       " '巴巴',\n",
       " '带',\n",
       " '帮助',\n",
       " '常',\n",
       " '常常',\n",
       " '常言说',\n",
       " '常言说得好',\n",
       " '常言道',\n",
       " '平素',\n",
       " '年复一年',\n",
       " '并',\n",
       " '并不',\n",
       " '并不是',\n",
       " '并且',\n",
       " '并排',\n",
       " '并无',\n",
       " '并没',\n",
       " '并没有',\n",
       " '并肩',\n",
       " '并非',\n",
       " '广大',\n",
       " '广泛',\n",
       " '应当',\n",
       " '应用',\n",
       " '应该',\n",
       " '庶乎',\n",
       " '庶几',\n",
       " '开外',\n",
       " '开始',\n",
       " '开展',\n",
       " '引起',\n",
       " '弗',\n",
       " '弹指之间',\n",
       " '强烈',\n",
       " '强调',\n",
       " '归',\n",
       " '归根到底',\n",
       " '归根结底',\n",
       " '归齐',\n",
       " '当',\n",
       " '当下',\n",
       " '当中',\n",
       " '当儿',\n",
       " '当前',\n",
       " '当即',\n",
       " '当口儿',\n",
       " '当地',\n",
       " '当场',\n",
       " '当头',\n",
       " '当庭',\n",
       " '当时',\n",
       " '当然',\n",
       " '当真',\n",
       " '当着',\n",
       " '形成',\n",
       " '彻夜',\n",
       " '彻底',\n",
       " '彼',\n",
       " '彼时',\n",
       " ...]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the stopword list (one entry per line of stopwords.txt)\n",
    "stopwords = read_file(dpath + \"stopwords.txt\")\n",
    "\n",
    "# Show a short preview instead of dumping the whole list into the notebook\n",
    "stopwords[:20]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def segment_text(each_row):\n",
    "    \"\"\"Tokenise one row's text with jieba and drop stopwords.\n",
    "\n",
    "    Args:\n",
    "        each_row: Row-like object whose 'text' entry is the raw string\n",
    "            (the notebook passes each DataFrame row via apply(axis=1)).\n",
    "\n",
    "    Returns:\n",
    "        str: The remaining tokens joined by single spaces.\n",
    "    \"\"\"\n",
    "    # `stopwords` is the module-level list; a set makes each membership\n",
    "    # test O(1) instead of a linear scan of the list for every token.\n",
    "    stopword_set = set(stopwords)\n",
    "    return ' '.join(word for word in jb.lcut(each_row['text']) if word not in stopword_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Dumping model to file cache C:\\Users\\home\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.698 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "# axis=1 passes each row to segment_text; the result is stored in a new column\n",
    "train['text_segmentation'] = train.apply(segment_text, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>text</th>\n",
       "      <th>text_segmentation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...</td>\n",
       "      <td>合晟 资产 一家 专注 股票 债券 二级 市场 投资 合格 投资者 提供 专业 资产 管理 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。</td>\n",
       "      <td>公司 主营业务 微 企业 个体 工商户 农户 客户 提供 贷款 服务 设立 主营业务 未 发...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...</td>\n",
       "      <td>公司 立足于 商业地产 服务 致力于 商业地产 开发 销售 运营 全 产业链 提供 一整套 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...</td>\n",
       "      <td>公司 工商管理 部门 核准 经营范围 投资 咨询 经济 信息 咨询 企业 管理 咨询 品牌 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...</td>\n",
       "      <td>公司 主营业务 中国 境内 港 澳 台 保险代理 销售 依托 产品 研究 能力 专业化 服务...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   label                                               text  \\\n",
       "0      2  合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...   \n",
       "1      2  公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。   \n",
       "2      1  公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...   \n",
       "3      2  公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...   \n",
       "4      2  该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...   \n",
       "\n",
       "                                   text_segmentation  \n",
       "0  合晟 资产 一家 专注 股票 债券 二级 市场 投资 合格 投资者 提供 专业 资产 管理 ...  \n",
       "1  公司 主营业务 微 企业 个体 工商户 农户 客户 提供 贷款 服务 设立 主营业务 未 发...  \n",
       "2  公司 立足于 商业地产 服务 致力于 商业地产 开发 销售 运营 全 产业链 提供 一整套 ...  \n",
       "3  公司 工商管理 部门 核准 经营范围 投资 咨询 经济 信息 咨询 企业 管理 咨询 品牌 ...  \n",
       "4  公司 主营业务 中国 境内 港 澳 台 保险代理 销售 依托 产品 研究 能力 专业化 服务...  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows, now including the text_segmentation column\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 词频特征/TFIDF特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>5816</th>\n",
       "      <th>5817</th>\n",
       "      <th>5818</th>\n",
       "      <th>5819</th>\n",
       "      <th>5820</th>\n",
       "      <th>5821</th>\n",
       "      <th>5822</th>\n",
       "      <th>5823</th>\n",
       "      <th>5824</th>\n",
       "      <th>5825</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 5826 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   0     1     2     3     4     5     6     7     8     9     ...  5816  \\\n",
       "0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   \n",
       "1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   \n",
       "2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   \n",
       "3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   \n",
       "4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   \n",
       "\n",
       "   5817  5818  5819  5820  5821  5822  5823  5824  5825  \n",
       "0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "\n",
       "[5 rows x 5826 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "# TfidfVectorizer does the work of CountVectorizer followed by TfidfTransformer\n",
    "# in a single step (shorter code, same computation).\n",
    "# min_df=5 ignores terms that appear in fewer than 5 documents.\n",
    "# NOTE(review): the default token_pattern keeps only tokens of 2+ word characters,\n",
    "# so single-character words are dropped - confirm this is intended.\n",
    "tfidf = TfidfVectorizer(min_df=5)\n",
    "\n",
    "# fit_transform returns a scipy sparse matrix; toarray() converts it to a dense ndarray.\n",
    "train_tfidf_sparse = tfidf.fit_transform(train['text_segmentation'])\n",
    "train_tfidf = train_tfidf_sparse.toarray()\n",
    "\n",
    "# Wrap the dense matrix in a DataFrame so it can be inspected.\n",
    "df_train_tfidf = pd.DataFrame(train_tfidf)\n",
    "\n",
    "df_train_tfidf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>5817</th>\n",
       "      <th>5818</th>\n",
       "      <th>5819</th>\n",
       "      <th>5820</th>\n",
       "      <th>5821</th>\n",
       "      <th>5822</th>\n",
       "      <th>5823</th>\n",
       "      <th>5824</th>\n",
       "      <th>5825</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 5827 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     0    1    2    3    4    5    6    7    8    9  ...  5817  5818  5819  \\\n",
       "0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   \n",
       "1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   \n",
       "2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   \n",
       "3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   \n",
       "4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   \n",
       "\n",
       "   5820  5821  5822  5823  5824  5825  label  \n",
       "0   0.0   0.0   0.0   0.0   0.0   0.0      2  \n",
       "1   0.0   0.0   0.0   0.0   0.0   0.0      2  \n",
       "2   0.0   0.0   0.0   0.0   0.0   0.0      1  \n",
       "3   0.0   0.0   0.0   0.0   0.0   0.0      2  \n",
       "4   0.0   0.0   0.0   0.0   0.0   0.0      2  \n",
       "\n",
       "[5 rows x 5827 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Attach the labels by position. df_train_tfidf was built from a bare array and has a\n",
    "# fresh 0..n-1 index, so assigning the Series itself would align on train's index and\n",
    "# could leave NaN or misplaced labels if train was filtered or re-indexed earlier.\n",
    "df_train_tfidf['label'] = train['label'].values\n",
    "df_train_tfidf.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 保存结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the TF-IDF features plus label column to CSV, with a header row and no index column.\n",
    "tfidf_csv_path = dpath + 'FE_train_tfidf.csv'\n",
    "df_train_tfidf.to_csv(tfidf_csv_path, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
